### Code Chunk #1: Basic data set exploration, modification of column classes, basic tables and barplots.

##### Data Exploration: understanding titanic data --------------------

# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.
# On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with 
# an iceberg, killing 1502 out of 2224 passengers and crew.
# This sensational tragedy shocked the international community and led to better 
# safety regulations for ships. One of the reasons that the shipwreck led to such 
# loss of life was that there were not enough lifeboats for the passengers and crew. 
# Although there was some element of luck involved in surviving the sinking, 
# some groups of people such as women, children, and the upper-class 
# were more likely to survive than others.

# VARIABLE DESCRIPTIONS:

# PassengerId     Unique passenger identifier
# Survived        Survival (0 = No; 1 = Yes)
# Pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd). Pclass is a proxy for socio-economic status (SES):
#                     1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower
# Name            Name
# Sex             Sex
# Age             Age (Age is in years; fractional if Age is less than one (1). If the Age is estimated, it is in the form xx.5)
# SibSp           Number of Siblings/Spouses Aboard
# Parch           Number of Parents/Children Aboard
# Ticket          Ticket Number
# Fare            Passenger Fare
# Cabin           Cabin
# Embarked        Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)


### ---------------------------------------------------------------------------

### Set up, data import and inspections

# Load packages after they have been installed.

library(psych)
library(scatterplot3d)
library(lattice)
library(MASS)

# Import a csv file
# `dir` is the directory that contains titanic.train.csv; adjust it to your own machine.
dir <- "~/R/IS_6482/MA1"

# file.path() joins path components with "/"; gettextf() is meant for
# translatable messages, not for building file paths.
inputfile <- file.path(dir, "titanic.train.csv")

# Read text columns as character (not factor); nominal columns are converted explicitly further down.
titanic <- read.csv(file = inputfile, stringsAsFactors = FALSE)

# Examine the overall data frame

# str() shows the number of observations, and the number, names, types and some values of columns

## str(titanic)

# You can retrieve and save the number of rows and number of columns of a data frame

nrow(titanic)
## [1] 891
row <- nrow(titanic)
row
## [1] 891
col <- ncol(titanic)
col
## [1] 12
# Show the head and tail rows of a data frame

head(titanic)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
titanic[1:6,]
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
head(titanic, n=1)
##   PassengerId Survived Pclass                    Name  Sex Age SibSp Parch
## 1           1        0      3 Braund, Mr. Owen Harris male  22     1     0
##      Ticket Fare Cabin Embarked
## 1 A/5 21171 7.25              S
titanic[1,]
##   PassengerId Survived Pclass                    Name  Sex Age SibSp Parch
## 1           1        0      3 Braund, Mr. Owen Harris male  22     1     0
##      Ticket Fare Cabin Embarked
## 1 A/5 21171 7.25              S
head(titanic, n=10)
##    PassengerId Survived Pclass
## 1            1        0      3
## 2            2        1      1
## 3            3        1      3
## 4            4        1      1
## 5            5        0      3
## 6            6        0      3
## 7            7        0      1
## 8            8        0      3
## 9            9        1      3
## 10          10        1      2
##                                                   Name    Sex Age SibSp
## 1                              Braund, Mr. Owen Harris   male  22     1
## 2  Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                               Heikkinen, Miss. Laina female  26     0
## 4         Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                             Allen, Mr. William Henry   male  35     0
## 6                                     Moran, Mr. James   male  NA     0
## 7                              McCarthy, Mr. Timothy J   male  54     0
## 8                       Palsson, Master. Gosta Leonard   male   2     3
## 9    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female  27     0
## 10                 Nasser, Mrs. Nicholas (Adele Achem) female  14     1
##    Parch           Ticket    Fare Cabin Embarked
## 1      0        A/5 21171  7.2500              S
## 2      0         PC 17599 71.2833   C85        C
## 3      0 STON/O2. 3101282  7.9250              S
## 4      0           113803 53.1000  C123        S
## 5      0           373450  8.0500              S
## 6      0           330877  8.4583              Q
## 7      0            17463 51.8625   E46        S
## 8      1           349909 21.0750              S
## 9      2           347742 11.1333              S
## 10     0           237736 30.0708              C
tail(titanic, n = 10)
##     PassengerId Survived Pclass                                     Name
## 882         882        0      3                       Markun, Mr. Johann
## 883         883        0      3             Dahlberg, Miss. Gerda Ulrika
## 884         884        0      2            Banfield, Mr. Frederick James
## 885         885        0      3                   Sutehall, Mr. Henry Jr
## 886         886        0      3     Rice, Mrs. William (Margaret Norton)
## 887         887        0      2                    Montvila, Rev. Juozas
## 888         888        1      1             Graham, Miss. Margaret Edith
## 889         889        0      3 Johnston, Miss. Catherine Helen "Carrie"
## 890         890        1      1                    Behr, Mr. Karl Howell
## 891         891        0      3                      Dooley, Mr. Patrick
##        Sex Age SibSp Parch           Ticket    Fare Cabin Embarked
## 882   male  33     0     0           349257  7.8958              S
## 883 female  22     0     0             7552 10.5167              S
## 884   male  28     0     0 C.A./SOTON 34068 10.5000              S
## 885   male  25     0     0  SOTON/OQ 392076  7.0500              S
## 886 female  39     0     5           382652 29.1250              Q
## 887   male  27     0     0           211536 13.0000              S
## 888 female  19     0     0           112053 30.0000   B42        S
## 889 female  NA     1     2       W./C. 6607 23.4500              S
## 890   male  26     0     0           111369 30.0000  C148        C
## 891   male  32     0     0           370376  7.7500              Q
# summary() shows the mean and the five-number statistics indicating the spread of each column's values

## summary(titanic)

# Remove unique identifiers from further analysis as they are not interesting without additional feature extractions.
# The columns are dropped by name (they are columns 1, 4 and 9 of the imported
# data frame) so the step stays correct if the column order of the csv changes.

titanic <- titanic[setdiff(names(titanic), c("PassengerId", "Name", "Ticket"))]

# Change Survived and other nominal variables to factors.
# The same factor() conversion is applied to every nominal column in one pass;
# assigning back by name keeps the columns in their original positions.

titanic[c("Survived", "Sex", "Pclass", "Cabin", "Embarked")] <-
  lapply(titanic[c("Survived", "Sex", "Pclass", "Cabin", "Embarked")], factor)

str(titanic)
## 'data.frame':    891 obs. of  9 variables:
##  $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass  : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp   : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch   : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin   : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## summary(titanic)

# Remove observations with missing Age values.
# The obvious disadvantage of this missing-data handling approach is that
# the resulting model cannot be applied to data with missing Age.
# Keeping observations with missing Age values requires careful imputation of the missing Ages.
# The various missing data imputation methods are beyond the knowledge required for this tutorial.

# Missing values in Age can cause problems. Test the following commands when you have time.
# var(titanic$Age)
# mean(titanic$Age)

# var(titanic[-which(is.na(titanic$Age)), ]$Age)
# sd(titanic[-which(is.na(titanic$Age)), ]$Age)

# Keep only the observations in which Age is known, i.e. is.na(titanic$Age) is FALSE.
# A logical mask is used instead of -which(is.na(...)): when no Age is missing,
# which() returns integer(0) and titanic[-integer(0), ] would drop every row.

titanic <- titanic[!is.na(titanic$Age), ]

## summary(titanic)

# On the other hand, keeping missing factor levels might be able to lead to meaningful models
# Empty level names of the Cabin and Embarked factors will cause problems in some analysis.
# Other missing factor value imputation remains a good option beyond the scope of this tutorial.

# levels() and sort() with factors

## str(titanic$Cabin)

## titanic$Cabin

# str() truncates the display of factor levels. levels() shows all of the factor levels of a factor variable

## levels(titanic$Cabin)

# It is useful to sort the levels based on how many observations contain a factor level.
# Remember table() that counts how many observations contain a factor level.
# Use sort to sort factor levels by the count of observations containing a level
# You can use sort inside barplot()

table(titanic$Cabin)
## 
##                             A10             A14             A16 
##             529               1               0               1 
##             A19             A20             A23             A24 
##               0               1               1               1 
##             A26             A31             A32             A34 
##               1               1               0               1 
##             A36              A5              A6              A7 
##               1               1               1               1 
##            B101            B102             B18             B19 
##               1               0               2               1 
##             B20             B22             B28              B3 
##               2               2               2               1 
##             B30             B35             B37             B38 
##               1               2               1               1 
##             B39              B4             B41             B42 
##               1               1               1               1 
##             B49              B5             B50     B51 B53 B55 
##               2               2               1               2 
## B57 B59 B63 B66         B58 B60             B69             B71 
##               2               2               1               1 
##             B73             B77             B78             B79 
##               1               2               0               1 
##             B80         B82 B84             B86             B94 
##               1               1               1               1 
##         B96 B98            C101            C103            C104 
##               4               1               1               1 
##            C106            C110            C111            C118 
##               0               1               1               1 
##            C123            C124            C125            C126 
##               2               1               2               1 
##            C128            C148              C2         C22 C26 
##               0               1               2               3 
##     C23 C25 C27             C30             C32             C45 
##               4               1               1               1 
##             C46             C47             C49             C50 
##               1               0               1               1 
##             C52             C54         C62 C64             C65 
##               1               1               1               2 
##             C68              C7             C70             C78 
##               2               1               1               2 
##             C82             C83             C85             C86 
##               1               2               1               1 
##             C87             C90             C91             C92 
##               1               1               1               1 
##             C93             C95             C99               D 
##               2               0               1               3 
##         D10 D12             D11             D15             D17 
##               1               1               1               2 
##             D19             D20             D21             D26 
##               1               2               0               2 
##             D28             D30             D33             D35 
##               1               1               2               2 
##             D36             D37             D45             D46 
##               2               1               0               1 
##             D47             D48             D49             D50 
##               1               1               1               1 
##             D56              D6              D7              D9 
##               1               1               1               1 
##             E10            E101             E12            E121 
##               1               2               1               2 
##             E17             E24             E25             E31 
##               1               2               2               1 
##             E33             E34             E36             E38 
##               1               1               1               1 
##             E40             E44             E46             E49 
##               1               2               1               1 
##             E50             E58             E63             E67 
##               1               1               1               2 
##             E68             E77              E8              F2 
##               1               1               2               3 
##             F33             F38              F4           F E69 
##               3               0               2               0 
##           F G63           F G73              G6               T 
##               1               2               4               1
sort(table(titanic$Cabin), decreasing = TRUE)
## 
##                         B96 B98     C23 C25 C27              G6 
##             529               4               4               4 
##         C22 C26               D              F2             F33 
##               3               3               3               3 
##             B18             B20             B22             B28 
##               2               2               2               2 
##             B35             B49              B5     B51 B53 B55 
##               2               2               2               2 
## B57 B59 B63 B66         B58 B60             B77            C123 
##               2               2               2               2 
##            C125              C2             C65             C68 
##               2               2               2               2 
##             C78             C83             C93             D17 
##               2               2               2               2 
##             D20             D26             D33             D35 
##               2               2               2               2 
##             D36            E101            E121             E24 
##               2               2               2               2 
##             E25             E44             E67              E8 
##               2               2               2               2 
##              F4           F G73             A10             A16 
##               2               2               1               1 
##             A20             A23             A24             A26 
##               1               1               1               1 
##             A31             A34             A36              A5 
##               1               1               1               1 
##              A6              A7            B101             B19 
##               1               1               1               1 
##              B3             B30             B37             B38 
##               1               1               1               1 
##             B39              B4             B41             B42 
##               1               1               1               1 
##             B50             B69             B71             B73 
##               1               1               1               1 
##             B79             B80         B82 B84             B86 
##               1               1               1               1 
##             B94            C101            C103            C104 
##               1               1               1               1 
##            C110            C111            C118            C124 
##               1               1               1               1 
##            C126            C148             C30             C32 
##               1               1               1               1 
##             C45             C46             C49             C50 
##               1               1               1               1 
##             C52             C54         C62 C64              C7 
##               1               1               1               1 
##             C70             C82             C85             C86 
##               1               1               1               1 
##             C87             C90             C91             C92 
##               1               1               1               1 
##             C99         D10 D12             D11             D15 
##               1               1               1               1 
##             D19             D28             D30             D37 
##               1               1               1               1 
##             D46             D47             D48             D49 
##               1               1               1               1 
##             D50             D56              D6              D7 
##               1               1               1               1 
##              D9             E10             E12             E17 
##               1               1               1               1 
##             E31             E33             E34             E36 
##               1               1               1               1 
##             E38             E40             E46             E49 
##               1               1               1               1 
##             E50             E58             E63             E68 
##               1               1               1               1 
##             E77           F G63               T             A14 
##               1               1               1               0 
##             A19             A32            B102             B78 
##               0               0               0               0 
##            C106            C128             C47             C95 
##               0               0               0               0 
##             D21             D45             F38           F E69 
##               0               0               0               0
barplot(sort(table(titanic$Cabin), decreasing = TRUE))

barplot(sort(table(titanic$Cabin), decreasing = FALSE))

# Fixing empty character level names for Cabin and Embarked

## levels(titanic$Cabin)[1]

# Rename the empty-string level of Cabin to "missing". The level is matched by
# its name rather than by position 1, so a real cabin is never renamed if the
# data happens to contain no empty level.
levels(titanic$Cabin)[levels(titanic$Cabin) == ""] <- "missing"

levels(titanic$Embarked)
## [1] ""  "C" "Q" "S"
barplot(sort(table(titanic$Embarked), decreasing = FALSE))

levels(titanic$Embarked)[1]
## [1] ""
# Rename the empty-string level of Embarked to "missing". The level is matched
# by its name rather than by position 1, so a real port is never renamed if the
# data happens to contain no empty level.
levels(titanic$Embarked)[levels(titanic$Embarked) == ""] <- "missing"

summary(titanic)
##  Survived Pclass      Sex           Age            SibSp       
##  0:424    1:186   female:261   Min.   : 0.42   Min.   :0.0000  
##  1:290    2:173   male  :453   1st Qu.:20.12   1st Qu.:0.0000  
##           3:355                Median :28.00   Median :0.0000  
##                                Mean   :29.70   Mean   :0.5126  
##                                3rd Qu.:38.00   3rd Qu.:1.0000  
##                                Max.   :80.00   Max.   :5.0000  
##                                                                
##      Parch             Fare                Cabin        Embarked  
##  Min.   :0.0000   Min.   :  0.00   missing    :529   missing:  2  
##  1st Qu.:0.0000   1st Qu.:  8.05   B96 B98    :  4   C      :130  
##  Median :0.0000   Median : 15.74   C23 C25 C27:  4   Q      : 28  
##  Mean   :0.4314   Mean   : 34.69   G6         :  4   S      :554  
##  3rd Qu.:1.0000   3rd Qu.: 33.38   C22 C26    :  3                
##  Max.   :6.0000   Max.   :512.33   D          :  3                
##                                    (Other)    :167

### Code Chunk #2: Numerical variable summaries including basic functions, quantiles and boxplots.

### understanding a single variable: numerical variables

# Show summary of one or more columns

summary(titanic$Pclass)
##   1   2   3 
## 186 173 355
summary(titanic[c("Sex", "Age")])
##      Sex           Age       
##  female:261   Min.   : 0.42  
##  male  :453   1st Qu.:20.12  
##               Median :28.00  
##               Mean   :29.70  
##               3rd Qu.:38.00  
##               Max.   :80.00
# obtain the mean, median, max, min and range of a numeric variable

mean(titanic$Age)
## [1] 29.69912
median(titanic$Age)
## [1] 28
range(titanic$Age)
## [1]  0.42 80.00
# calculate the difference of the range

max.Age <- max(titanic$Age)
min.Age <- min(titanic$Age)
min.Age
## [1] 0.42
max.Age
## [1] 80
range.diff.Age <- max.Age - min.Age
range.diff.Age
## [1] 79.58
# min-max normalization of first observation's Age to a value between zero and 1

titanic$Age[1]
## [1] 22
(titanic$Age[1] - min.Age)/range.diff.Age
## [1] 0.2711737
# diff() calculates differences between an attribute's values in the referenced record and that in the record following it
# diff(titanic$Age) 
# ?diff
# titanic$Age

# use quantile to calculate the five-number summary for Age

quantile(titanic$Age)
##     0%    25%    50%    75%   100% 
##  0.420 20.125 28.000 38.000 80.000
# the 1st percentile and the 99th percentile

quantile(titanic$Age, probs = c(0.01, 0.99))
##    1%   99% 
##  1.00 65.87
# quintiles and deciles 

quantile(titanic$Age, seq(from = 0, to = 1, by = 0.20))
##    0%   20%   40%   60%   80%  100% 
##  0.42 19.00 25.00 31.80 41.00 80.00
quantile(titanic$Age, seq(from = 0, to = 1, by = 0.10))
##    0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100% 
##  0.42 14.00 19.00 22.00 25.00 28.00 31.80 36.00 41.00 50.00 80.00
# boxplot of a numeric variable

boxplot(titanic$Age, main="Boxplot of Age in the titanic data set",
        ylab="Age")

# practice quantile for another variable - SibSp

quantile(titanic$SibSp)
##   0%  25%  50%  75% 100% 
##    0    0    0    1    5
# the 1st percentile and the 99th percentile

quantile(titanic$SibSp, probs = c(0.01, 0.99))
##  1% 99% 
##   0   4
# quintiles and deciles 

quantile(titanic$SibSp, seq(from = 0, to = 1, by = 0.20))
##   0%  20%  40%  60%  80% 100% 
##    0    0    0    0    1    5
quantile(titanic$SibSp, seq(from = 0, to = 1, by = 0.10))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##    0    0    0    0    0    0    0    1    1    1    5
# boxplot of a numeric variable

boxplot(titanic$SibSp, main="Boxplot of Sibsp in the titanic data set",
        ylab="Sibsp")

# IQR = 3rd quartile - 1st quartile

# maxline = 3rd quartile + 1.5*IQR

# minline = 1st quartile - 1.5*IQR

# one definition of outliers: values > maxline or values < minline are drawn as circles or dots 

# scatter plot of a numeric variable. It becomes too busy for a large number of observations

plot(titanic$SibSp)

# histograms of a numeric variable

hist(titanic$SibSp, main = "Histogram of Sibsp in the titanic data set",
     xlab = "Sibsp")

### For a right skewed distribution, the mean is typically greater than the median

hist(titanic$Age, main = "Histogram of Age in the titanic data set",
     xlab = "Age")

hist(titanic$Fare, main = "Histogram of Fare in the titanic data set",
     xlab = "Fare")

hist(titanic$Parch, main = "Histogram of Parch  in the titanic data set",
     xlab = "Parch")

# Search for and understand the meaning of skewed, bimodal and multimodal continuous distributions

# variance and standard deviation of a numeric variable

var(titanic$Age)
## [1] 211.0191
sd(titanic$Age)
## [1] 14.5265
var(titanic$SibSp)
## [1] 0.8644973
sd(titanic$SibSp)
## [1] 0.9297835

### Code Chunk #3: Exploration of factor variables including dotplots, barplots and a table function with an error.

### Exploring factor variables

# A factor's distinct values

is.factor(titanic$Survived)
## [1] TRUE
summary(titanic$Survived)
##   0   1 
## 424 290
nlevels(titanic$Survived)
## [1] 2
str(titanic$Survived)
##  Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
is.factor(as.character(titanic$Survived))
## [1] FALSE
summary(as.character(titanic$Survived))
##    Length     Class      Mode 
##       714 character character
# How many observations have each factor value?

plot(titanic$Survived)

plot(titanic$Survived, main = "Plot of Survived in the titanic data set",
     xlab = "Survived")

Survived.table <- table(titanic$Survived)

Survived.table
## 
##   0   1 
## 424 290
str(Survived.table)
##  'table' int [1:2(1d)] 424 290
##  - attr(*, "dimnames")=List of 1
##   ..$ : chr [1:2] "0" "1"
barplot(Survived.table, main = "Plot of Survived in the titanic data set",
        xlab = "Survived")
# remember the difference between the input data structures to plot() and barplot()

# compute table proportions

# Run prop.table(titanic$Survived) to see the error in this command

prop.table(Survived.table)
## 
##         0         1 
## 0.5938375 0.4061625
Survived.prop <- prop.table(table(titanic$Survived))

Survived.prop 
## 
##         0         1 
## 0.5938375 0.4061625
# round decimals 

round(Survived.prop, digits = 2)
## 
##    0    1 
## 0.59 0.41
options(digits = 2)

prop.table(Survived.prop)
## 
##    0    1 
## 0.59 0.41
# Remember to get help from Help Pane using "?"

### Code Chunk #4: Exploring relationships between variables, scatter plots, boxplots, 3D scatterplots and parallel coordinate plots.

### Understand relationships of multiple variables

# cor,  boxplot, 2D scatter plot - plot, 3D scatter plot

# scatter plot: two numeric variables

plot(titanic[,5:6])

plot(titanic$SibSp, titanic$Parch)

# Generate correlation coefficients of two numeric variables in a 2x2 matrix
# cor(X,Y) lies between -1 and 1. zero means no correlation. 1 or -1 indicates full correlation
# positive value means positive correlation and negative values mean negative relationships
# Examine the components in the formulation for correlation coefficients
# cor(X,Y) = cov(X,Y)/(sd(X)*sd(Y)) 
# cov(X,Y) = E[(X - E(X)) * (Y - E(Y))]

cov(titanic[,c(5,6)])
##       SibSp Parch
## SibSp  0.86  0.30
## Parch  0.30  0.73
var(titanic[,c(5,6)])
##       SibSp Parch
## SibSp  0.86  0.30
## Parch  0.30  0.73
var(titanic[,5])
## [1] 0.86
sd(titanic[,5])
## [1] 0.93
var(titanic[5])
##       SibSp
## SibSp  0.86
# sd() is meant for numeric input; Pclass (column 2) is a factor, and calling
# sd()/var() on a factor is deprecated and slated to become an error. Convert
# the level labels "1", "2", "3" to numbers before computing the deviation.
sd(as.numeric(as.character(titanic$Pclass)))
## [1] 0.84
cor(titanic[c("SibSp", "Parch")])
##       SibSp Parch
## SibSp  1.00  0.38
## Parch  0.38  1.00
cor(titanic[5:6])
##       SibSp Parch
## SibSp  1.00  0.38
## Parch  0.38  1.00
cor(titanic[,5:6])
##       SibSp Parch
## SibSp  1.00  0.38
## Parch  0.38  1.00
# Generate the correlation matrix of all numeric variables

cor(titanic[4:7])
##          Age SibSp Parch  Fare
## Age    1.000 -0.31 -0.19 0.096
## SibSp -0.308  1.00  0.38 0.138
## Parch -0.189  0.38  1.00 0.205
## Fare   0.096  0.14  0.21 1.000
# Generate 2D scatter plots and correlation coefficients

pairs(titanic[4:7])

## pairs.panels(titanic[-1])

## pairs.panels(titanic)

pairs.panels(titanic[,c('Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked')])

## Examine relationships between numeric variables and factors

# boxplot groups values of a numeric variable based on the values of a factor
boxplot(Age~Survived, data = titanic)

boxplot(Parch~Survived, data = titanic)

boxplot(SibSp~Survived, data = titanic)

boxplot(Fare~Survived, data = titanic)

# The aggregate function

# We can use the aggregate command to aggregate a numeric feature by a categorical one.

# The aggregate function has three parameters

# 1. The numeric value, e.g. sales, to be aggregated to find out, e.g., total of sales,
#   average of sales, number of sales (i.e. orders).

# 2. The set of categories, product_category and sales_region, on which you wish
#   to aggregate

# 3. The aggregation function (e.g., sum, mean, length) that you wish to use

aggregate(SibSp~Survived, summary, data = titanic)
##   Survived SibSp.Min. SibSp.1st Qu. SibSp.Median SibSp.Mean SibSp.3rd Qu.
## 1        0       0.00          0.00         0.00       0.53          1.00
## 2        1       0.00          0.00         0.00       0.49          1.00
##   SibSp.Max.
## 1       5.00
## 2       4.00
aggregate(Parch~Survived, summary, data = titanic)
##   Survived Parch.Min. Parch.1st Qu. Parch.Median Parch.Mean Parch.3rd Qu.
## 1        0       0.00          0.00         0.00       0.37          0.00
## 2        1       0.00          0.00         0.00       0.53          1.00
##   Parch.Max.
## 1       6.00
## 2       5.00
# scatter plot of numeric values and factor values

plot(titanic$Survived)

plot(titanic$Age)

plot(titanic$Age,titanic$Survived)

# SibSp vs Parch, with both colour and plotting symbol taken from Survived.
# as.numeric() on the factor gives its level codes (1, 2), which serve as the
# pch values; the same codes index the palette for col.
plot(titanic$SibSp, titanic$Parch, col = titanic$Survived,
     pch = as.numeric(titanic$Survived))

# Same plot through with(): columns are referred to by bare name, so the
# `titanic$` prefix is not repeated and the axes are labelled SibSp / Parch.
with(titanic, plot(SibSp, Parch, col = Survived, pch = as.numeric(Survived)))

# As above, with a main title and a subtitle.
with(titanic, plot(SibSp, Parch, col = Survived, pch = as.numeric(Survived),
                   main = "2d scatter plot of titanic data", sub = "SibSp vs Parch"))


# Show the active colour palette: entries 1 and 2 are the colours used for the
# two Survived levels.
palette()
## [1] "black"   "red"     "green3"  "blue"    "cyan"    "magenta" "yellow" 
## [8] "gray"
# Legend for the most recent plot; col and pch 1:2 match the level codes above.
legend("topright", legend = levels(titanic$Survived), col = 1:2, cex = 0.8, pch = 1:2)

# 3D scatter plots (scatterplot3d); the plotting symbol encodes Survived

# Sex (x), Age (y) and SibSp (z).
scatterplot3d(
  x = titanic$Sex, y = titanic$Age, z = titanic$SibSp,
  pch = as.numeric(titanic$Survived),
  main = "3D scatter plot of titanic data"
)

# Symbols 1 and 2 correspond to the two levels of Survived.
legend("topright", legend = levels(titanic$Survived), pch = 1:2, cex = 0.8)

# Fare (x), Pclass (y) and Sex (z).
scatterplot3d(
  x = titanic$Fare, y = titanic$Pclass, z = titanic$Sex,
  pch = as.numeric(titanic$Survived),
  main = "3D scatter plot of titanic data"
)

legend("topright", legend = levels(titanic$Survived), pch = 1:2, cex = 0.8)

# Female passengers only. which() keeps just the rows where the comparison is
# TRUE, so rows with a missing Sex (if any) are dropped as well.
titanicf <- titanic[which(titanic$Sex == "female"), ]
# Column-wise summary of the female subset (recorded output below).
summary(object = titanicf)
##  Survived Pclass      Sex           Age         SibSp         Parch    
##  0: 64    1: 85   female:261   Min.   : 1   Min.   :0.0   Min.   :0.0  
##  1:197    2: 74   male  :  0   1st Qu.:18   1st Qu.:0.0   1st Qu.:0.0  
##           3:102                Median :27   Median :0.0   Median :0.0  
##                                Mean   :28   Mean   :0.6   Mean   :0.7  
##                                3rd Qu.:37   3rd Qu.:1.0   3rd Qu.:1.0  
##                                Max.   :63   Max.   :5.0   Max.   :6.0  
##                                                                        
##       Fare         Cabin        Embarked  
##  Min.   :  7   missing:171   missing:  2  
##  1st Qu.: 13   G6     :  4   C      : 61  
##  Median : 26   F33    :  3   Q      : 12  
##  Mean   : 48   B18    :  2   S      :186  
##  3rd Qu.: 58   B28    :  2                
##  Max.   :512   B35    :  2                
##                (Other): 77
# Male passengers only. which() keeps just the rows where the comparison is
# TRUE, so rows with a missing Sex (if any) are dropped as well.
titanicm <- titanic[which(titanic$Sex == "male"), ]
# Column-wise summary of the male subset (recorded output below).
summary(object = titanicm)
##  Survived Pclass      Sex           Age         SibSp         Parch    
##  0:360    1:101   female:  0   Min.   : 0   Min.   :0.0   Min.   :0.0  
##  1: 93    2: 99   male  :453   1st Qu.:21   1st Qu.:0.0   1st Qu.:0.0  
##           3:253                Median :29   Median :0.0   Median :0.0  
##                                Mean   :31   Mean   :0.4   Mean   :0.3  
##                                3rd Qu.:39   3rd Qu.:1.0   3rd Qu.:0.0  
##                                Max.   :80   Max.   :5.0   Max.   :5.0  
##                                                                        
##       Fare             Cabin        Embarked  
##  Min.   :  0   missing    :358   missing:  0  
##  1st Qu.:  8   F2         :  3   C      : 69  
##  Median : 13   B51 B53 B55:  2   Q      : 16  
##  Mean   : 27   B96 B98    :  2   S      :368  
##  3rd Qu.: 28   C23 C25 C27:  2                
##  Max.   :512   D26        :  2                
##                (Other)    : 84
# Male passengers: Age (x) against SibSp (y); the plotting symbol is the
# level code of Survived.
plot(titanicm$Age, titanicm$SibSp, pch = as.numeric(titanicm$Survived),
     main = "2D scatter plot of males' SibSp and Age")

# Legend built from the same data frame that was plotted (titanicm), so the
# labels always describe the symbols drawn above.
legend("topright", legend = levels(titanicm$Survived), cex = 0.8, pch = 1:2)

# Female passengers: Fare (x) against Pclass (y); the plotting symbol is the
# level code of Survived.
plot(
  x = titanicf$Fare, y = titanicf$Pclass,
  pch = as.numeric(titanicf$Survived),
  main = "2D scatter plot of titanic females' Pclass vs Fare"
)

# Symbols 1 and 2 correspond to the two levels of Survived.
legend("topright", legend = levels(titanicf$Survived), pch = 1:2, cex = 0.8)

# Parallel plots and parallel coordinate plots: relationships between numeric
# variables and factors

# Required libraries - lattice and MASS

# `| Survived` conditions the plot on Survived.
# NOTE(review): var.label is the argument name used with parcoord() further
# down; confirm that parallelplot() actually honours it.

# Every column except the first.
parallelplot(
  ~ titanic[-1] | Survived,
  data = titanic,
  var.label = TRUE
)

# Columns 4 to 7.
parallelplot(
  ~ titanic[4:7] | Survived,
  data = titanic,
  var.label = TRUE
)

# Pairs of columns drawn from 4 to 7.
parallelplot(
  ~ titanic[4:5] | Survived,
  data = titanic,
  var.label = TRUE
)

parallelplot(
  ~ titanic[c(4, 6)] | Survived,
  data = titanic,
  var.label = TRUE
)

parallelplot(
  ~ titanic[c(6, 7)] | Survived,
  data = titanic,
  var.label = TRUE
)

parallelplot(
  ~ titanic[c(5, 6)] | Survived,
  data = titanic,
  var.label = TRUE
)

# Show the active colour palette.
palette()
## [1] "black"   "red"     "green3"  "blue"    "cyan"    "magenta" "yellow" 
## [8] "gray"
# The legend() call that works for the other plots cannot add a legend to the
# parallelplot() output above.

# Parallel coordinate plots (parcoord): the line colour is taken from
# Survived, and var.label = TRUE is passed to label the axes.

# Columns 4 to 7.
parcoord(titanic[4:7], col = titanic$Survived, var.label = TRUE)

# Pairs of columns drawn from 4 to 7.
parcoord(titanic[4:5], col = titanic$Survived, var.label = TRUE)

parcoord(titanic[c(4, 6)], col = titanic$Survived, var.label = TRUE)

parcoord(titanic[c(4, 7)], col = titanic$Survived, var.label = TRUE)

parcoord(titanic[c(5, 6)], col = titanic$Survived, var.label = TRUE)

# Show the active colour palette: entries 1 and 2 are the colours used for the
# two Survived levels.
palette()
## [1] "black"   "red"     "green3"  "blue"    "cyan"    "magenta" "yellow" 
## [8] "gray"
# parcoord() draws lines, not points, so the legend keys are line samples
# (lty) in the two colours rather than point symbols that never appear in the
# plot. The legend is added to the most recent parcoord() plot only.
legend("topright", legend = levels(titanic$Survived), col = 1:2, lty = 1, cex = 0.8)

##### end of Titanic Data Exploration Tutorial